IMPORTING DATASETS

setwd("C:/Users/shubh/Desktop/Data Science Using R/Class 7 Data Science Using R")

customer_data <- read.csv("R Credit Card Case Study/Customer Acqusition.csv")
customer_data$No <- NULL
spend_data <- read.csv("R Credit Card Case Study/spend.csv")
spend_data$Sl.No. <- NULL
repay_data <- read.csv("R Credit Card Case Study/Repayment.csv")
repay_data$SL.No. <- NULL

IMPORTING PACKAGES

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
require(ggplot2)
## Loading required package: ggplot2
require(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

RENAMING COLUMNS FOR BETTER UNDERSTANDING

repay_data <- dplyr::rename(repay_data, "Amt_Repay" = "Amount") 
spend_data <- dplyr::rename(spend_data, "Amt_Spend" = "Amount")

CONVERSION TO PROPER DATE FORMATS

repay_data$Month <- lubridate::dmy(repay_data$Month)
spend_data$Month <- lubridate::dmy(spend_data$Month)

Q1(a). REPLACE AGE WITH MEAN WHERE AGE IS LESS THAN 18

customer_data[customer_data$Age < 18, "Age"]
## [1] 16 13 15 12 16 16
customer_data[customer_data$Age < 18, "Age"] <- mean(customer_data$Age, na.rm = TRUE)
customer_data[customer_data$Age < 18, "Age"]
## numeric(0)

Q1(b). IF AMT_SPEND > LIMIT REPLACE WITH 50% OF LIMIT

customer_spend <- dplyr::left_join(x = customer_data, y = spend_data, by = c("Customer" = "Customer") ) 
head(customer_spend, 5)
##   Customer Age      City Product  Limit Company       Segment      Month
## 1       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-12
## 2       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-03
## 3       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-15
## 4       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-25
## 5       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2005-01-17
##        Type Amt_Spend
## 1 JEWELLERY 485470.80
## 2     PETRO 410556.13
## 3   CLOTHES  23740.46
## 4      FOOD 484342.47
## 5    CAMERA 369694.07
head(customer_spend[customer_spend$Amt_Spend > customer_spend$Limit, "Amt_Spend"], 5)
## [1] 199554.7 376887.5 439648.5 384078.7 117089.2
customer_spend[customer_spend$Amt_Spend > customer_spend$Limit, "Amt_Spend"] <-  
0.5*customer_spend[customer_spend$Amt_Spend > customer_spend$Limit ,"Limit"]

head(customer_spend[customer_spend$Amt_Spend > customer_spend$Limit, "Amt_Spend"], 5)
## numeric(0)

Q1(c). IF AMT_REPAY > LIMIT REPLACE WITH LIMIT.

customer_repay <- dplyr::left_join(x = customer_data, y = repay_data, by = c("Customer" = "Customer"))
head(customer_repay, 5)
##   Customer Age      City Product  Limit Company       Segment      Month
## 1       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-12
## 2       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-03
## 3       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-15
## 4       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2004-01-25
## 5       A1  76 BANGALORE    Gold 500000      C1 Self Employed 2005-01-17
##   Amt_Repay
## 1  495414.8
## 2  245899.0
## 3  259490.1
## 4  437555.1
## 5  165972.9
head(customer_repay[customer_repay$Amt_Repay > customer_repay$Limit, "Amt_Repay"], 5)
## [1] 185955.1 412783.3 148038.7 179144.7 381635.7
customer_repay[customer_repay$Amt_Repay > customer_repay$Limit, "Amt_Repay"] <-  
customer_repay[customer_repay$Amt_Repay > customer_repay$Limit ,"Limit"]

head(customer_repay[customer_repay$Amt_Repay > customer_repay$Limit, "Amt_Repay"], 5)
## numeric(0)

Q2(a). NUMBER OF DISTINCT CUSTOMERS.

distinct_customers <- count(dplyr::distinct(customer_data))
print(paste("No. of distinct customers is", distinct_customers ))
## [1] "No. of distinct customers is 100"

Q2(b). NUMBER OF DISTINCT CATEGORIES(SEGMENTS).

count_categories <- count(dplyr::distinct(customer_data, Segment))
print(paste("No. of distinct categories is", count_categories ))
## [1] "No. of distinct categories is 5"

Q2(c). AVERAGE MONTHLY SPEND BY CUSTOMERS.

avg_spend_mth <- customer_spend%>% dplyr::group_by(lubridate::year(Month))%>%dplyr::summarise(AVG_SPEND = sum(Amt_Spend)/12)
## `summarise()` ungrouping output (override with `.groups` argument)
avg_spend_mth <- sum(avg_spend_mth$AVG_SPEND)/3
print(paste("Average Monthly Spend By All customers in a month is ", avg_spend_mth))
## [1] "Average Monthly Spend By All customers in a month is  6105030.14527778"

Q2(d). AVERAGE MONTHLY REPAYMENT BY CUSTOMERS.

avg_repay_mth <- customer_repay%>%dplyr::group_by(lubridate::year(Month))%>%dplyr::summarise(AVG_REPAY = sum(Amt_Repay)/12)
## `summarise()` ungrouping output (override with `.groups` argument)
avg_repay_mth <- sum(avg_repay_mth)/3
print(paste("Average Monthly Repayment By All Customers in a month is ", avg_repay_mth))
## [1] "Average Monthly Repayment By All Customers in a month is  6807070.55805555"

Q2(e). PROFIT FOR BANK FOR EACH MONTH.

monthly_spend <- customer_spend %>% 
                 dplyr::group_by(lubridate::year(Month), lubridate::month(Month)) %>%
                 summarise(Amt_Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'lubridate::year(Month)' (override with `.groups` argument)
monthly_spend <- dplyr::rename(monthly_spend, "Year" = "lubridate::year(Month)",
                                              "Month" = "lubridate::month(Month)")

monthly_repay <- customer_repay %>% 
                 dplyr::group_by(lubridate::year(Month), lubridate::month(Month)) %>%
                 summarise(Amt_Repay = sum(Amt_Repay))
## `summarise()` regrouping output by 'lubridate::year(Month)' (override with `.groups` argument)
monthly_repay <- dplyr::rename(monthly_repay, "Year" = "lubridate::year(Month)",
                                              "Month" = "lubridate::month(Month)")

monthly_profit <- left_join(monthly_repay, monthly_spend, by = c("Year" = "Year", "Month" = "Month"))
monthly_profit$Profit <- monthly_profit$Amt_Repay - monthly_profit$Amt_Spend
monthly_profit$Profit <- ifelse(monthly_profit$Profit > 0, monthly_profit$Profit * 0.29, 0)
monthly_profit
## # A tibble: 30 x 5
## # Groups:   Year [3]
##     Year Month Amt_Repay Amt_Spend  Profit
##    <dbl> <dbl>     <dbl>     <dbl>   <dbl>
##  1  2004     1 44792021. 44279812. 148541.
##  2  2004     2  3391538.  3835952.      0 
##  3  2004     3  2666519.  1871913. 230436.
##  4  2004     4  5268980.  3489415. 516074.
##  5  2004     5  3782760.  2146577. 474493.
##  6  2004     9  1902816.  2402110.      0 
##  7  2004    11  1788399.  1114989. 195289.
##  8  2005     1  4894040.  5383607.      0 
##  9  2005     2 25491800. 25928183.      0 
## 10  2005     4  3634904.  4100066.      0 
## # ... with 20 more rows

Q2(f). TOP 5 PRODUCT TYPES

top_5_products <- customer_spend %>% dplyr::group_by(customer_spend$Type) %>% 
                  summarise(Frequency = n()) %>% arrange(desc(Frequency)) %>% head(5)
## `summarise()` ungrouping output (override with `.groups` argument)
print(top_5_products)
## # A tibble: 5 x 2
##   `customer_spend$Type` Frequency
##   <chr>                     <int>
## 1 PETRO                       200
## 2 CAMERA                      160
## 3 FOOD                        160
## 4 AIR TICKET                  147
## 5 TRAIN TICKET                132

Q2(g). CITY HAVING MAXIMUM SPEND

city_max_spend <- customer_spend %>% dplyr::group_by(customer_spend$City) %>%
                  summarise(Total_Spend = sum(Amt_Spend)) %>% arrange(desc(Total_Spend)) %>%
                  head(1)
## `summarise()` ungrouping output (override with `.groups` argument)
print(city_max_spend)
## # A tibble: 1 x 2
##   `customer_spend$City` Total_Spend
##   <chr>                       <dbl>
## 1 COCHIN                  45963514.

Q2(h). EXPENDITURE ON BASIS OF AGE GROUP

customer_spend$Age_Group <- ifelse(customer_data$Age > 65, "Old(>65)", 
                                  ifelse(customer_data$Age > 50, "Senior(51-65)",
                                         ifelse(customer_data$Age > 30, "Middle Age(31-50)",
                                                ifelse(customer_data$Age >= 18, "Young(18-30)"))))
age_group_spend <- customer_spend %>% dplyr::group_by(Age_Group) %>% 
                   summarise(Total_Spend = sum(Amt_Spend)) %>% arrange(desc(Total_Spend))
## `summarise()` ungrouping output (override with `.groups` argument)
print(age_group_spend)
## # A tibble: 4 x 2
##   Age_Group         Total_Spend
##   <chr>                   <dbl>
## 1 Middle Age(31-50)   85833092.
## 2 Senior(51-65)       56882251.
## 3 Old(>65)            43999713.
## 4 Young(18-30)        33066029.

Q2.(i) TOP 10 CUSTOMERS IN TERMS OF REPAYMENT.

top_10_repayments <- customer_repay %>% dplyr::group_by(Customer) %>% 
                     summarise(Total_Repayment = sum(Amt_Repay)) %>% 
                     arrange(desc(Total_Repayment)) %>% head(10)
## `summarise()` ungrouping output (override with `.groups` argument)
print(top_10_repayments)
## # A tibble: 10 x 2
##    Customer Total_Repayment
##    <chr>              <dbl>
##  1 A61            10539143.
##  2 A60             9876291.
##  3 A13             9572001.
##  4 A43             8489871.
##  5 A45             8448335.
##  6 A12             8334760.
##  7 A14             7943269.
##  8 A44             7744730.
##  9 A39             7622483.
## 10 A42             7615461.

Q3. CITYWISE SPEND ON EACH PRODUCT ON YEARLY BASIS.

city_analysis <-customer_spend %>% dplyr::group_by(City, Type, lubridate::year(Month)) %>% 
                                   summarise(Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'City', 'Type' (override with `.groups` argument)
city_analysis <- dplyr::rename(city_analysis, "Year" = "lubridate::year(Month)")
city_analysis <- data.frame(city_analysis)
print(head(data.frame(city_analysis), 5))
##        City       Type Year     Spend
## 1 BANGALORE AIR TICKET 2004  749981.0
## 2 BANGALORE AIR TICKET 2005 1972296.9
## 3 BANGALORE AIR TICKET 2006 1311522.5
## 4 BANGALORE       AUTO 2005  141539.8
## 5 BANGALORE       AUTO 2006  822785.3
city_analysis_plot <- ggplot2::ggplot(data = city_analysis) + 
                               aes(x = City, y = Spend, fill = Type) + 
                               geom_bar(stat = "identity") + facet_grid(Year~.)
plot(city_analysis_plot)

city_analysis_plot <- plotly::ggplotly(city_analysis_plot)
city_analysis_plot

Q4(a). MONTHLY COMPARISON OF TOTAL SPENDS CITYWISE.

spend_citywise <- customer_spend %>% 
                  dplyr::group_by(lubridate::year(Month),lubridate::month(Month), City) %>%
                  summarise(Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'lubridate::year(Month)', 'lubridate::month(Month)' (override with `.groups` argument)
spend_citywise <- dplyr::rename(spend_citywise, "Year" = "lubridate::year(Month)",
                                                "Month" = "lubridate::month(Month)")
spend_citywise$Month <- lubridate::month(spend_citywise$Month, label = TRUE)
spend_citywise_plot <- ggplot2::ggplot(data = spend_citywise) + 
                                aes(x = City, y = Spend, fill = Month) + 
                                geom_bar(stat = "identity") + facet_grid(Year~.)
spend_citywise_plot <- plotly::ggplotly(spend_citywise_plot)
spend_citywise_plot

Q4(b). COMPARISON OF YEARLY SPEND ON AIR TICKET.

spend_airticket <- customer_spend[customer_spend$Type == "AIR TICKET", ] %>%
                                  dplyr::group_by(lubridate::year(Month)) %>% 
                                  dplyr::summarise(Yearly_Spend = sum(Amt_Spend)) 
## `summarise()` ungrouping output (override with `.groups` argument)
spend_airticket <- dplyr::rename(spend_airticket, "Year" = "lubridate::year(Month)")
spend_airticket_plot <- ggplot2::ggplot(data = spend_airticket) + 
                        aes(x = Year, y = Yearly_Spend) + 
                        geom_bar(stat = "identity")
spend_airticket_plot <- plotly::ggplotly(spend_airticket_plot)
spend_airticket_plot

Q4(c). COMPARISON OF MONTHLY SPEND OF FOR EACH PRODUCT.

product_spend <- customer_spend %>% 
                 dplyr::group_by(lubridate::year(Month), lubridate::month(Month), Type ) %>%
                 summarise(Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'lubridate::year(Month)', 'lubridate::month(Month)' (override with `.groups` argument)
product_spend <- dplyr::rename(product_spend, "Year" = "lubridate::year(Month)", 
                                              "Month" = "lubridate::month(Month)")
product_spend$Month <- lubridate::month(product_spend$Month, label = TRUE)
product_spend$Year <- as.factor(product_spend$Year)
product_spend_plot <- ggplot2::ggplot(data = product_spend) + 
                               aes(x = Month, y = Spend, fill = Year) + 
                               geom_bar(stat = "identity", position = "dodge") + 
                               facet_grid(Type~.)
product_spend_plot <- plotly::ggplotly(product_spend_plot)
product_spend_plot

Q5. USER DEFINED FUNCTION FOR ANALYSIS.

# CORRECTING A MINOR MISTAKE
customer_repay[customer_repay$Product == "Platimum", "Product"] <- "Platinum"

top_10_cust_repay <- function(product, period){
  
  data <- customer_repay[customer_repay$Product == product, ]
  
  if(period == "year"|period == "Year"|period == "Yearly"|period == "yearly"|period == "YEAR"|period == "YEARLY")
  {
    data_summary <- data %>% dplyr::group_by(Customer, City, lubridate::year(Month)) %>%
                                    summarise(Repayment = sum(Amt_Repay)) %>%
                                    arrange(City, desc(Repayment))
    
    data_summary <- dplyr::rename(data_summary, "Year" = "lubridate::year(Month)")
  } 
  
  else if(period == "month"|period == "monthly"|period == "MONTH"|period == "MONTHLY"|period == "Month"|period == "Monthly")
  {
    data_summary <- data %>% dplyr::group_by(Customer, City, lubridate::month(Month)) %>%
                                    summarise(Repayment = sum(Amt_Repay)) %>%
                                    arrange(City, desc(Repayment))
    
    data_summary <- dplyr::rename(data_summary, "Month" = "lubridate::month(Month)")
  }
  
  View(data_summary)
  
  data_bangalore <- head(data_summary[data_summary$City == "BANGALORE", ], 10)
  data_bombay <- head(data_summary[data_summary$City == "BOMBAY", ], 10)
  data_calcutta <- head(data_summary[data_summary$City == "CALCUTTA", ], 10)
  data_chennai <- head(data_summary[data_summary$City == "CHENNAI", ], 10)
  data_cochin <- head(data_summary[data_summary$City == "COCHIN", ], 10)
  data_delhi <- head(data_summary[data_summary$City == "DELHI", ], 10)
  data_patna <- head(data_summary[data_summary$City == "PATNA", ], 10)
  data_trivandrum <- head(data_summary[data_summary$City == "TRIVANDRUM", ], 10)
  
  final_data <- rbind(data_bangalore, data_bombay, data_calcutta, data_chennai, data_cochin, data_delhi, data_patna, data_trivandrum)
  
  if(period == "year")
  {
    final_data <- final_data %>% dplyr::group_by(Customer, City) %>% summarise %>% arrange(City)
  }
  
  else if(period == "month")
  {
    final_data <- final_data %>% dplyr::group_by(Customer, City) %>% summarise %>% arrange(City)
  }
  
  final_data
}


top_10_cust_repay("Gold", "month")
## `summarise()` regrouping output by 'Customer', 'City' (override with `.groups` argument)
## `summarise()` regrouping output by 'Customer' (override with `.groups` argument)
## # A tibble: 25 x 2
## # Groups:   Customer [25]
##    Customer City     
##    <chr>    <chr>    
##  1 A1       BANGALORE
##  2 A13      BANGALORE
##  3 A14      BANGALORE
##  4 A30      BANGALORE
##  5 A43      BANGALORE
##  6 A12      BOMBAY   
##  7 A17      BOMBAY   
##  8 A42      BOMBAY   
##  9 A62      BOMBAY   
## 10 A15      CALCUTTA 
## # ... with 15 more rows
top_10_cust_repay("Platinum", "year")
## `summarise()` regrouping output by 'Customer', 'City' (override with `.groups` argument)
## `summarise()` regrouping output by 'Customer' (override with `.groups` argument)
## # A tibble: 24 x 2
## # Groups:   Customer [24]
##    Customer City     
##    <chr>    <chr>    
##  1 A19      BANGALORE
##  2 A37      BANGALORE
##  3 A5       BANGALORE
##  4 A52      BANGALORE
##  5 A33      BOMBAY   
##  6 A36      BOMBAY   
##  7 A4       BOMBAY   
##  8 A51      BOMBAY   
##  9 A20      CALCUTTA 
## 10 A34      CALCUTTA 
## # ... with 14 more rows